R Markdown for Case Study 02

DDSAnalytics is an analytics company that specializes in talent management solutions for Fortune 100 companies. Talent management is defined as the iterative process of developing and retaining employees. It may include workforce planning, employee training programs, identifying high-potential employees and reducing/preventing voluntary employee turnover (attrition). To gain a competitive edge over its competition, DDSAnalytics is planning to leverage data science for talent management. The executive leadership has identified predicting employee turnover as its first application of data science for talent management. Before the business green lights the project, they have tasked your data science team to conduct an analysis of existing employee data.

You have been given a dataset (CaseStudy2-data.csv on AWS S3 in the ddsproject1 bucket) to do a data analysis to identify factors that lead to attrition. You should identify the top three factors that contribute to turnover (backed up by evidence provided by analysis). There may or may not be a need to create derived attributes/variables/features. The business is also interested in learning about any job role specific trends that may exist in the data set (e.g., “Data Scientists have the highest job satisfaction”). You can also provide any other interesting trends and observations from your analysis. The analysis should be backed up by robust experimentation and appropriate visualization. Experiments and analysis must be conducted in R. You will also be asked to build a model to predict attrition.

#include all Libraries required for the EDA

library(class)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(e1071)
library(naniar)
library(ggplot2)
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2
## ──
## ✔ tibble  3.1.8     ✔ purrr   1.0.0
## ✔ tidyr   1.2.1     ✔ forcats 0.5.2
## ✔ readr   2.1.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ purrr::lift()   masks caret::lift()
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(ROCit)
library(readxl)
library(knitr)

#Read the Case Study 2 files in .csv and .xlsx format
# NOTE(review): these are absolute, user-specific paths, so this script only
# runs on the author's machine. Consider relative paths (or here::here()) to
# make the analysis portable.

# Main training data: 870 employees, 36 columns including the Attrition response
CS2 = read.csv("/Users/Banu/Documents/MSDS/Unit 14 and 15 Case Study 2/CaseStudy2-data.csv")

# Competition set with the Attrition column withheld (per the file name)
CS2Test1 = read.csv("/Users/Banu/Documents/MSDS/Unit 14 and 15 Case Study 2/CaseStudy2CompSet No Attrition.csv")

# Competition set with salary withheld (per the file name); Excel, hence read_excel
CS2Test2 = read_excel("/Users/Banu/Documents/MSDS/Unit 14 and 15 Case Study 2/CaseStudy2CompSet No Salary.xlsx")

#Review for any missing values

# naniar::gg_miss_var plots the number of missing values per variable
gg_miss_var(CS2) + ggtitle("Missing Values in Dataset")

gg_miss_var(CS2Test1) + ggtitle("Missing Values in Dataset")

#describe the datasets - Number of Row and Columns

dim(CS2)
## [1] 870  36
dim(CS2Test1)
## [1] 300  35

#describe the dataset’s columns

# str() shows each column's type and sample values. CS2Test1 has 35 columns
# (not 36) because the Attrition response is withheld in the competition set.
str(CS2)
## 'data.frame':    870 obs. of  36 variables:
##  $ ID                      : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Age                     : int  32 40 35 32 24 27 41 37 34 34 ...
##  $ Attrition               : chr  "No" "No" "No" "No" ...
##  $ BusinessTravel          : chr  "Travel_Rarely" "Travel_Rarely" "Travel_Frequently" "Travel_Rarely" ...
##  $ DailyRate               : int  117 1308 200 801 567 294 1283 309 1333 653 ...
##  $ Department              : chr  "Sales" "Research & Development" "Research & Development" "Sales" ...
##  $ DistanceFromHome        : int  13 14 18 1 2 10 5 10 10 10 ...
##  $ Education               : int  4 3 2 4 1 2 5 4 4 4 ...
##  $ EducationField          : chr  "Life Sciences" "Medical" "Life Sciences" "Marketing" ...
##  $ EmployeeCount           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ EmployeeNumber          : int  859 1128 1412 2016 1646 733 1448 1105 1055 1597 ...
##  $ EnvironmentSatisfaction : int  2 3 3 3 1 4 2 4 3 4 ...
##  $ Gender                  : chr  "Male" "Male" "Male" "Female" ...
##  $ HourlyRate              : int  73 44 60 48 32 32 90 88 87 92 ...
##  $ JobInvolvement          : int  3 2 3 3 3 3 4 2 3 2 ...
##  $ JobLevel                : int  2 5 3 3 1 3 1 2 1 2 ...
##  $ JobRole                 : chr  "Sales Executive" "Research Director" "Manufacturing Director" "Sales Executive" ...
##  $ JobSatisfaction         : int  4 3 4 4 4 1 3 4 3 3 ...
##  $ MaritalStatus           : chr  "Divorced" "Single" "Single" "Married" ...
##  $ MonthlyIncome           : int  4403 19626 9362 10422 3760 8793 2127 6694 2220 5063 ...
##  $ MonthlyRate             : int  9250 17544 19944 24032 17218 4809 5561 24223 18410 15332 ...
##  $ NumCompaniesWorked      : int  2 1 2 1 1 1 2 2 1 1 ...
##  $ Over18                  : chr  "Y" "Y" "Y" "Y" ...
##  $ OverTime                : chr  "No" "No" "No" "No" ...
##  $ PercentSalaryHike       : int  11 14 11 19 13 21 12 14 19 14 ...
##  $ PerformanceRating       : int  3 3 3 3 3 4 3 3 3 3 ...
##  $ RelationshipSatisfaction: int  3 1 3 3 3 3 1 3 4 2 ...
##  $ StandardHours           : int  80 80 80 80 80 80 80 80 80 80 ...
##  $ StockOptionLevel        : int  1 0 0 2 0 2 0 3 1 1 ...
##  $ TotalWorkingYears       : int  8 21 10 14 6 9 7 8 1 8 ...
##  $ TrainingTimesLastYear   : int  3 2 2 3 2 4 5 5 2 3 ...
##  $ WorkLifeBalance         : int  2 4 3 3 3 2 2 3 3 2 ...
##  $ YearsAtCompany          : int  5 20 2 14 6 9 4 1 1 8 ...
##  $ YearsInCurrentRole      : int  2 7 2 10 3 7 2 0 1 2 ...
##  $ YearsSinceLastPromotion : int  0 4 2 5 1 1 0 0 0 7 ...
##  $ YearsWithCurrManager    : int  3 9 2 7 3 7 3 0 0 7 ...
str(CS2Test1)
## 'data.frame':    300 obs. of  35 variables:
##  $ ID                      : int  1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 ...
##  $ Age                     : int  35 33 26 55 29 51 52 39 31 31 ...
##  $ BusinessTravel          : chr  "Travel_Rarely" "Travel_Rarely" "Travel_Rarely" "Travel_Rarely" ...
##  $ DailyRate               : int  750 147 1330 1311 1246 1456 585 1387 1062 534 ...
##  $ Department              : chr  "Research & Development" "Human Resources" "Research & Development" "Research & Development" ...
##  $ DistanceFromHome        : int  28 2 21 2 19 1 29 10 24 20 ...
##  $ Education               : int  3 3 3 3 3 4 4 5 3 3 ...
##  $ EducationField          : chr  "Life Sciences" "Human Resources" "Medical" "Life Sciences" ...
##  $ EmployeeCount           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ EmployeeNumber          : int  1596 1207 1107 505 1497 145 2019 1618 1252 587 ...
##  $ EnvironmentSatisfaction : int  2 2 1 3 3 1 1 2 3 1 ...
##  $ Gender                  : chr  "Male" "Male" "Male" "Female" ...
##  $ HourlyRate              : int  46 99 37 97 77 30 40 76 96 66 ...
##  $ JobInvolvement          : int  4 3 3 3 2 2 3 3 2 3 ...
##  $ JobLevel                : int  2 1 1 4 2 3 1 2 2 3 ...
##  $ JobRole                 : chr  "Laboratory Technician" "Human Resources" "Laboratory Technician" "Manager" ...
##  $ JobSatisfaction         : int  3 3 3 4 3 1 4 1 1 3 ...
##  $ MaritalStatus           : chr  "Married" "Married" "Divorced" "Single" ...
##  $ MonthlyIncome           : int  3407 3600 2377 16659 8620 7484 3482 5377 6812 9824 ...
##  $ MonthlyRate             : int  25348 8429 19373 23258 23757 25796 19788 3835 17198 22908 ...
##  $ NumCompaniesWorked      : int  1 1 1 2 1 3 2 2 1 3 ...
##  $ Over18                  : chr  "Y" "Y" "Y" "Y" ...
##  $ OverTime                : chr  "No" "No" "No" "Yes" ...
##  $ PercentSalaryHike       : int  17 13 20 13 14 20 15 13 19 12 ...
##  $ PerformanceRating       : int  3 3 4 3 3 4 3 3 3 3 ...
##  $ RelationshipSatisfaction: int  4 4 3 3 3 3 2 4 2 1 ...
##  $ StandardHours           : int  80 80 80 80 80 80 80 80 80 80 ...
##  $ StockOptionLevel        : int  2 1 1 0 2 0 2 3 0 0 ...
##  $ TotalWorkingYears       : int  10 5 1 30 10 23 16 10 10 12 ...
##  $ TrainingTimesLastYear   : int  3 2 0 2 3 1 3 3 2 2 ...
##  $ WorkLifeBalance         : int  2 3 2 3 3 2 2 3 3 3 ...
##  $ YearsAtCompany          : int  10 5 1 5 10 13 9 7 10 1 ...
##  $ YearsInCurrentRole      : int  9 4 1 4 7 12 8 7 9 0 ...
##  $ YearsSinceLastPromotion : int  6 1 0 1 0 12 0 7 1 0 ...
##  $ YearsWithCurrManager    : int  8 4 0 2 4 8 0 7 8 0 ...

#Our Objective is to predict attrition, so with the given dataset, let's see how the response variable Attrition behaves with respect to the other explanatory variables

#Plotting Attrition by Age

# geom_bar() puts employee counts on the y axis, so it is labeled "Count"
# (the earlier ylab("Attrition") mislabeled the count axis).
CS2 %>% ggplot(aes(x = Age, fill = Attrition)) + geom_bar() + ggtitle("Distribution of Age and Attrition") + ylab("Count")

CS2 %>% ggplot(aes(x = Age, fill = Attrition)) + geom_boxplot() + ggtitle("Distribution of Age and Attrition") + ylab("Attrition")

# Same bar chart restricted to leavers only
CS2 %>% filter(Attrition == "Yes") %>% ggplot(aes(x = Age, fill = Attrition)) + geom_bar() + ggtitle("Distribution of Age and Attrition") + ylab("Count")

#The bar graph does suggest that there is more attrition in the age around 30, with a median age of 33 years

#Plotting Attrition by DistanceFromHome

CS2 %>% ggplot(aes(x = DistanceFromHome, fill = Attrition)) + geom_bar() + ggtitle("Distribution of DistanceFromHome and Attrition") + ylab("Count")

#Looking at the plot, Distance From Home is not significant in determining Attrition

#Plotting Attrition by Daily, Hourly, Monthly Rate and Monthly Income

CS2 %>% ggplot(aes(x = DailyRate, fill = Attrition)) + geom_boxplot() + ggtitle("Distribution of DailyRate and Attrition") + ylab("Attrition")

CS2 %>% ggplot(aes(x = HourlyRate, fill = Attrition)) + geom_boxplot() + ggtitle("Distribution of HourlyRate and Attrition") + ylab("Attrition")

CS2 %>% ggplot(aes(x = MonthlyRate, fill = Attrition)) + geom_boxplot() + ggtitle("Distribution of MonthlyRate and Attrition") + ylab("Attrition")

CS2 %>% ggplot(aes(x = MonthlyIncome, fill = Attrition)) + geom_boxplot() + ggtitle("Distribution of MonthlyIncome and Attrition") + ylab("Attrition")

# Compare median pay measures between leavers and stayers
CS2 %>% group_by(Attrition) %>% summarise(median(DailyRate))
## # A tibble: 2 × 2
##   Attrition `median(DailyRate)`
##   <chr>                   <dbl>
## 1 No                       828.
## 2 Yes                      751
CS2 %>% group_by(Attrition) %>% summarise(median(HourlyRate))
## # A tibble: 2 × 2
##   Attrition `median(HourlyRate)`
##   <chr>                    <dbl>
## 1 No                        64.5
## 2 Yes                       68.5
CS2 %>% group_by(Attrition) %>% summarise(median(MonthlyRate))
## # A tibble: 2 × 2
##   Attrition `median(MonthlyRate)`
##   <chr>                     <dbl>
## 1 No                       14236.
## 2 Yes                      12651
CS2 %>% group_by(Attrition) %>% summarise(median(MonthlyIncome))
## # A tibble: 2 × 2
##   Attrition `median(MonthlyIncome)`
##   <chr>                       <dbl>
## 1 No                          5208.
## 2 Yes                         3171

#Plotting Attrition by YearsSinceLastPromotion

# geom_bar() counts employees on the y axis, so label it "Count"
CS2 %>% ggplot(aes(x = YearsSinceLastPromotion, fill = Attrition)) + geom_bar() + ggtitle("Distribution of YearsSinceLastPromotion and Attrition") + ylab("Count")

CS2 %>% group_by(Attrition) %>% summarise(mean(YearsSinceLastPromotion))
## # A tibble: 2 × 2
##   Attrition `mean(YearsSinceLastPromotion)`
##   <chr>                               <dbl>
## 1 No                                   2.18
## 2 Yes                                  2.14
CS2 %>% filter(Attrition == "Yes") %>% ggplot(aes(x = YearsSinceLastPromotion, fill = Attrition)) + geom_bar() + ggtitle("Distribution of YearsSinceLastPromotion and Attrition") + ylab("Count")

#One would think that if an employee is not promoted for a long time, then they would leave, but the plot does not suggest that relation

#Plotting Attrition by TrainingTimesLastYear

CS2 %>% ggplot(aes(x = TrainingTimesLastYear, fill = Attrition)) + geom_bar() + ggtitle("Distribution of TrainingTimesLastYear and Attrition") + ylab("Count")

CS2 %>% group_by(Attrition) %>% summarise(mean(TrainingTimesLastYear))
## # A tibble: 2 × 2
##   Attrition `mean(TrainingTimesLastYear)`
##   <chr>                             <dbl>
## 1 No                                 2.87
## 2 Yes                                2.65
CS2 %>% filter(Attrition == "Yes") %>% ggplot(aes(x = TrainingTimesLastYear, fill = Attrition)) + geom_bar() + ggtitle("Distribution of TrainingTimesLastYear and Attrition") + ylab("Count")

#Plot does not suggest that Training Time influences Attrition

#Plotting Attrition by PercentSalaryHike

CS2 %>% ggplot(aes(x = PercentSalaryHike, fill = Attrition)) + geom_bar() + ggtitle("Distribution of PercentSalaryHike and Attrition") + ylab("Count")

CS2 %>% group_by(Attrition) %>% summarise(median(PercentSalaryHike))
## # A tibble: 2 × 2
##   Attrition `median(PercentSalaryHike)`
##   <chr>                           <dbl>
## 1 No                                 14
## 2 Yes                                14
CS2 %>% filter(Attrition == "Yes") %>% ggplot(aes(x = PercentSalaryHike, fill = Attrition)) + geom_bar() + ggtitle("Distribution of PercentSalaryHike and Attrition") + ylab("Count")

#Looking at the data and plot, employees with lower Percent Salary Hike are more likely to quit
# NOTE(review): both groups have a median hike of 14, so the medians alone do
# not support this conclusion — it rests on the shape of the plot only.

#Plotting Attrition by TotalWorkingYears

# geom_bar() counts employees on the y axis, so label it "Count"
CS2 %>% ggplot(aes(x = TotalWorkingYears, fill = Attrition)) + geom_bar() + ggtitle("Distribution of TotalWorkingYears and Attrition") + ylab("Count")

CS2 %>% group_by(Attrition) %>% summarise(median(TotalWorkingYears))
## # A tibble: 2 × 2
##   Attrition `median(TotalWorkingYears)`
##   <chr>                           <dbl>
## 1 No                               10  
## 2 Yes                               6.5
CS2 %>% filter(Attrition == "Yes") %>% ggplot(aes(x = TotalWorkingYears, fill = Attrition)) + geom_bar() + ggtitle("Distribution of TotalWorkingYears and Attrition") + ylab("Count")

#Plot does suggest the employees with more Total Working Years are less likely to quit

#Plotting Attrition by YearsAtCompany

CS2 %>% ggplot(aes(x = YearsAtCompany, fill = Attrition)) + geom_bar() + ggtitle("Distribution of YearsAtCompany and Attrition") + ylab("Count")

CS2 %>% group_by(Attrition) %>% summarise(median(YearsAtCompany))
## # A tibble: 2 × 2
##   Attrition `median(YearsAtCompany)`
##   <chr>                        <dbl>
## 1 No                               6
## 2 Yes                              3
CS2 %>% filter(Attrition == "Yes") %>% ggplot(aes(x = YearsAtCompany, fill = Attrition)) + geom_bar() + ggtitle("Distribution of YearsAtCompany and Attrition") + ylab("Count")

#Plot does suggest the employees with more Years at the Company are less likely to quit

#Plotting Attrition by YearsInCurrentRole

CS2 %>% ggplot(aes(x = YearsInCurrentRole, fill = Attrition)) + geom_bar() + ggtitle("Distribution of YearsInCurrentRole and Attrition") + ylab("Count")

CS2 %>% group_by(Attrition) %>% summarise(median(YearsInCurrentRole))
## # A tibble: 2 × 2
##   Attrition `median(YearsInCurrentRole)`
##   <chr>                            <dbl>
## 1 No                                   3
## 2 Yes                                  2
CS2 %>% filter(Attrition == "Yes") %>% ggplot(aes(x = YearsInCurrentRole, fill = Attrition)) + geom_bar() + ggtitle("Distribution of YearsInCurrentRole and Attrition") + ylab("Count")

#Plot does suggest the employees with less Years in current role are more likely to quit

#Plotting Attrition by JobLevel

CS2 %>% ggplot(aes(x = JobLevel, fill = Attrition)) + geom_bar() + ggtitle("Distribution of JobLevel and Attrition") + ylab("Count")

CS2 %>% filter(Attrition == "Yes") %>% ggplot(aes(x = JobLevel, fill = Attrition)) + geom_bar() + ggtitle("Distribution of JobLevel and Attrition") + ylab("Count")

#Plot does suggest the employees at Lower Job Level are more likely to quit

#Plotting Attrition by MaritalStatus

# geom_bar() counts employees on the y axis, so label it "Count"
CS2 %>% ggplot(aes(x = MaritalStatus, fill = Attrition)) + geom_bar() + ggtitle("Distribution of MaritalStatus and Attrition") + ylab("Count")

CS2 %>% filter(Attrition == "Yes") %>% ggplot(aes(x = MaritalStatus, fill = Attrition)) + geom_bar() + ggtitle("Distribution of MaritalStatus and Attrition") + ylab("Count")

#Plot does suggest the single employees are slightly more likely to quit

#Plotting Attrition by JobSatisfaction

CS2 %>% ggplot(aes(x = JobSatisfaction, fill = Attrition)) + geom_bar() + ggtitle("Distribution of JobSatisfaction and Attrition") + ylab("Count")

CS2 %>% filter(Attrition == "Yes") %>% ggplot(aes(x = JobSatisfaction, fill = Attrition)) + geom_bar() + ggtitle("Distribution of JobSatisfaction and Attrition") + ylab("Count")

#Plot does suggest the level of Job Satisfaction is not playing a significant role in Attrition

#Plotting Attrition by RelationshipSatisfaction

CS2 %>% ggplot(aes(x = RelationshipSatisfaction, fill = Attrition)) + geom_bar() + ggtitle("Distribution of RelationshipSatisfaction and Attrition") + ylab("Count")

CS2 %>% filter(Attrition == "Yes") %>% ggplot(aes(x = RelationshipSatisfaction, fill = Attrition)) + geom_bar() + ggtitle("Distribution of RelationshipSatisfaction and Attrition") + ylab("Count")

#Plot does suggest the level of RelationshipSatisfaction is not playing a significant role in Attrition

#Plotting Attrition by NumCompaniesWorked

CS2 %>% ggplot(aes(x = NumCompaniesWorked, fill = Attrition)) + geom_bar() + ggtitle("Distribution of NumCompaniesWorked and Attrition") + ylab("Count")

CS2 %>% filter(Attrition == "Yes") %>% ggplot(aes(x = NumCompaniesWorked, fill = Attrition)) + geom_bar() + ggtitle("Distribution of NumCompaniesWorked and Attrition") + ylab("Count")

#Plot does suggest the level of Num of Companies worked is not playing a significant role in Attrition

#Plotting Attrition by BusinessTravel

CS2 %>% ggplot(aes(x = BusinessTravel, fill = Attrition)) + geom_bar() + ggtitle("Distribution of BusinessTravel and Attrition") + ylab("Count")

CS2 %>% filter(Attrition == "Yes") %>% ggplot(aes(x = BusinessTravel, fill = Attrition)) + geom_bar() + ggtitle("Distribution of BusinessTravel and Attrition") + ylab("Count")

#Plot does suggest the Business Travel is not influencing Attrition

#Plotting Attrition by StockOptionLevel

CS2 %>% ggplot(aes(x = StockOptionLevel, fill = Attrition)) + geom_bar() + ggtitle("Distribution of StockOptionLevel and Attrition") + ylab("Count")

CS2 %>% filter(Attrition == "Yes") %>% ggplot(aes(x = StockOptionLevel, fill = Attrition)) + geom_bar() + ggtitle("Distribution of StockOptionLevel and Attrition") + ylab("Count")

#Plot does suggest the employees with lower stock options are more likely to quit

##Reviewing several of the attributes and their relationship to Attrition, it looks like these are the ones that have a significant influence on Attrition: Age, Income, JobLevel, PercentSalaryHike, StockOptionLevel, TotalWorkingYears, YearsAtCompany, YearsInCurrentRole

#Plot Percentage Attrition by JobLevel filled by StockOptionLevel

# .groups = "drop" silences the "`summarise()` has grouped output" message, and
# factor(StockOptionLevel) draws a discrete legend for the 0-3 ordinal levels
# instead of a continuous colour gradient. PA is the within-JobLevel attrition
# proportion, so the y axis is labeled as a percentage.
CS2 %>% group_by(StockOptionLevel, JobLevel, Attrition) %>% summarise(JRCount = n(), .groups = "drop") %>% group_by(JobLevel) %>% mutate(PA = JRCount/sum(JRCount)) %>% filter(Attrition == "Yes") %>% ggplot(aes(x = JobLevel, y = PA, fill = factor(StockOptionLevel))) + geom_bar(stat="identity") + ggtitle("Distribution of JobLevel by  Attrition") + ylab("Percent Attrition") + xlab("JobLevel") + labs(fill = "StockOptionLevel")

#Plot shows highest % of attrition at 26% for Job Level 1

#Plot Percentage Attrition by JobRole filled by StockOptionLevel

CS2 %>% group_by(StockOptionLevel, JobRole, Attrition) %>% summarise(JRCount = n(), .groups = "drop") %>% group_by(JobRole) %>% mutate(PA = JRCount/sum(JRCount)) %>% filter(Attrition == "Yes") %>% ggplot(aes(x = JobRole, y = PA, fill = factor(StockOptionLevel))) + geom_bar(stat="identity") + ggtitle("Distribution of JobRole by  Attrition") + ylab("Percent Attrition") + xlab("JobRole") + labs(fill = "StockOptionLevel")

#Plot shows highest % of attrition at 45% for Job Role Sales Representatives

#Plot Percentage Attrition by JobLevel filled by StockOptionLevel
# NOTE(review): this chunk is an exact duplicate of the JobLevel plot above
# (its original header said "JobRole" but the code plots JobLevel); retained
# to preserve the document's figure sequence.

CS2 %>% group_by(StockOptionLevel, JobLevel, Attrition) %>% summarise(JRCount = n(), .groups = "drop") %>% group_by(JobLevel) %>% mutate(PA = JRCount/sum(JRCount)) %>% filter(Attrition == "Yes") %>% ggplot(aes(x = JobLevel, y = PA, fill = factor(StockOptionLevel))) + geom_bar(stat="identity") + ggtitle("Distribution of JobLevel by  Attrition") + ylab("Percent Attrition") + xlab("JobLevel") + labs(fill = "StockOptionLevel")

#Plot shows highest % of attrition at 26% for Job Level 1

#Plot Percentage Attrition by PercentSalaryHike filled by JobLevel

CS2 %>% group_by(JobLevel, PercentSalaryHike, Attrition) %>% summarise(JRCount = n(), .groups = "drop") %>% group_by(PercentSalaryHike) %>% mutate(PA = JRCount/sum(JRCount)) %>% filter(Attrition == "Yes") %>% ggplot(aes(x = PercentSalaryHike, y = PA, fill = factor(JobLevel))) + geom_bar(stat="identity") + ggtitle("Distribution of PercentSalaryHike by  Attrition") + ylab("Percent Attrition") + xlab("PercentSalaryHike") + labs(fill = "JobLevel")

#Plot shows highest % of Salary Hikes were given to employees at Job Level 1 and 2

#After using the above variables in the KNN and NB models, I have chosen the following explanatory variables to predict Attrition

#JobLevel , StockOptionLevel and YearsAtCompany

#These three variables gave balanced values for Accuracy, Specificity and Sensitivity

#Training using KNN Model with probability of 50% and 30% for the dataset as given, which is unbalanced

# Columns 16, 29, 33 = JobLevel, StockOptionLevel, YearsAtCompany; column 3 = Attrition.
# NOTE(review): the model is scored on its own training data (train == test),
# so these metrics are in-sample and optimistic.
# Fixed: the second argument was CS2[c(16,29,33)] (column selection without the
# row comma) — same columns on a data.frame, but made consistent with every
# other knn() call in this file.
classifications = knn(CS2[,c(16,29,33)],CS2[,c(16,29,33)],CS2[,3], prob = TRUE, k = 10)

table(classifications,CS2[,3])
##                
## classifications  No Yes
##             No  711 112
##             Yes  19  28
CM = confusionMatrix(table(classifications,CS2[,3]))
CM
## Confusion Matrix and Statistics
## 
##                
## classifications  No Yes
##             No  711 112
##             Yes  19  28
##                                           
##                Accuracy : 0.8494          
##                  95% CI : (0.8239, 0.8725)
##     No Information Rate : 0.8391          
##     P-Value [Acc > NIR] : 0.2176          
##                                           
##                   Kappa : 0.2378          
##                                           
##  Mcnemar's Test P-Value : 9.126e-16       
##                                           
##             Sensitivity : 0.9740          
##             Specificity : 0.2000          
##          Pos Pred Value : 0.8639          
##          Neg Pred Value : 0.5957          
##              Prevalence : 0.8391          
##          Detection Rate : 0.8172          
##    Detection Prevalence : 0.9460          
##       Balanced Accuracy : 0.5870          
##                                           
##        'Positive' Class : No              
## 
#Get probs
# knn() stores the vote share of the *winning* class in attr(, "prob"), so
# flip it to P(Attrition == "Yes") before thresholding.
probs = ifelse(classifications == "Yes",attributes(classifications)$prob, 1- attributes(classifications)$prob)

#Threshold
# Lowering the decision threshold from .5 to .3 trades some overall accuracy
# for much better specificity on the minority "Yes" class.
NewClass = ifelse(probs > .3, "Yes", "No")
table(NewClass,CS2[,3])
##         
## NewClass  No Yes
##      No  647  81
##      Yes  83  59
CM = confusionMatrix(table(NewClass,CS2[,3]))
CM
## Confusion Matrix and Statistics
## 
##         
## NewClass  No Yes
##      No  647  81
##      Yes  83  59
##                                          
##                Accuracy : 0.8115         
##                  95% CI : (0.7839, 0.837)
##     No Information Rate : 0.8391         
##     P-Value [Acc > NIR] : 0.9868         
##                                          
##                   Kappa : 0.306          
##                                          
##  Mcnemar's Test P-Value : 0.9378         
##                                          
##             Sensitivity : 0.8863         
##             Specificity : 0.4214         
##          Pos Pred Value : 0.8887         
##          Neg Pred Value : 0.4155         
##              Prevalence : 0.8391         
##          Detection Rate : 0.7437         
##    Detection Prevalence : 0.8368         
##       Balanced Accuracy : 0.6539         
##                                          
##        'Positive' Class : No             
## 
# ROC
# NOTE(review): rocit() is given hard class labels (the 1/2 coding of the
# predictions) rather than continuous scores, so each "curve" has a single
# operating point and its AUC simply equals the balanced accuracy of that
# classification (0.587 and 0.654 above match the confusion matrices). For a
# true ROC curve, pass the continuous score: rocit(score = probs, class = CS2[,3]).
## Warning: package 'ROCit' was built under R version 3.5.2
ROCit_obj_fifty <- rocit(score=as.numeric(classifications),class=CS2[,3])
plot(ROCit_obj_fifty)

ROCit_obj_fifty$AUC
## [1] 0.5869863
## Warning: package 'ROCit' was built under R version 3.5.2
ROCit_obj_thirty <- rocit(score=as.numeric(as.factor(NewClass)),class=CS2[,3])
plot(ROCit_obj_thirty)

ROCit_obj_thirty$AUC
## [1] 0.653865

#using a probability of 30% is giving better results than 50%

#Training using Naive Bayes model for the given unbalanced dataset with a 70/30 split
# NOTE(review): no set.seed() call anywhere in this script, so the split (and
# every result below) varies from run to run.

splitPerc = .7 #Training / Test split Percentage
# seq_len(nrow(...)) replaces the redundant seq(1:length(CS2$ID)) idiom, and
# the split fraction now actually uses splitPerc instead of a second
# hard-coded .7
trainI = sample(seq_len(nrow(CS2)), round(splitPerc * nrow(CS2)))
train = CS2[trainI,]
test = CS2[-trainI,]
# Columns 16, 29, 33 = JobLevel, StockOptionLevel, YearsAtCompany
model = naiveBayes(train[,c(16,29,33)],train$Attrition)
table(predict(model,test[,c(16,29,33)]),test$Attrition)
##      
##        No Yes
##   No  222  39
##   Yes   0   0
CM = confusionMatrix(table(predict(model,test[,c(16,29,33)]),test$Attrition))
CM
## Confusion Matrix and Statistics
## 
##      
##        No Yes
##   No  222  39
##   Yes   0   0
##                                           
##                Accuracy : 0.8506          
##                  95% CI : (0.8014, 0.8915)
##     No Information Rate : 0.8506          
##     P-Value [Acc > NIR] : 0.5426          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : 1.166e-09       
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.8506          
##          Neg Pred Value :    NaN          
##              Prevalence : 0.8506          
##          Detection Rate : 0.8506          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : No              
## 
# On the unbalanced data NB predicted "No" for every employee (specificity 0),
# which motivates the resampling experiments below.

#Undersampling and using KNN and NB to predict

# Attrition is a character column, so table() (rather than summary(), which
# only reports Length/Class/Mode for characters) shows the class balance:
# 730 "No" vs 140 "Yes".
table(CS2$Attrition)
## 
##  No Yes 
## 730 140
CS2 %>% group_by(Attrition) %>% summarise(n())
## # A tibble: 2 × 2
##   Attrition `n()`
##   <chr>     <int>
## 1 No          730
## 2 Yes         140
OnlyNo = CS2 %>% filter(Attrition == "No")
# Downsample the majority class to the minority-class size; derive the counts
# from the data instead of hard-coding 730 and 140.
nYes = sum(CS2$Attrition == "Yes")
OnlyNoUnder = OnlyNo[sample(seq_len(nrow(OnlyNo)), nYes),]

UnderSamp = rbind(CS2 %>% filter(Attrition == "Yes"), OnlyNoUnder)
dim(UnderSamp)
## [1] 280  36
splitPerc = .7 #Training / Test split Percentage
trainI = sample(seq_len(nrow(UnderSamp)), round(splitPerc * nrow(UnderSamp)))
train = UnderSamp[trainI,]
test = UnderSamp[-trainI,]
# Columns 16, 29, 33 = JobLevel, StockOptionLevel, YearsAtCompany
model = naiveBayes(train[,c(16,29,33)],train$Attrition)
table(predict(model,test[,c(16,29,33)]),test$Attrition)
##      
##       No Yes
##   No  19   8
##   Yes 22  35
CM = confusionMatrix(table(predict(model,test[,c(16,29,33)]),test$Attrition))
CM
## Confusion Matrix and Statistics
## 
##      
##       No Yes
##   No  19   8
##   Yes 22  35
##                                           
##                Accuracy : 0.6429          
##                  95% CI : (0.5308, 0.7445)
##     No Information Rate : 0.5119          
##     P-Value [Acc > NIR] : 0.01057         
##                                           
##                   Kappa : 0.2796          
##                                           
##  Mcnemar's Test P-Value : 0.01762         
##                                           
##             Sensitivity : 0.4634          
##             Specificity : 0.8140          
##          Pos Pred Value : 0.7037          
##          Neg Pred Value : 0.6140          
##              Prevalence : 0.4881          
##          Detection Rate : 0.2262          
##    Detection Prevalence : 0.3214          
##       Balanced Accuracy : 0.6387          
##                                           
##        'Positive' Class : No              
## 
# KNN on the undersampled data (scored in-sample, as above); row comma added
# to the second argument for consistency.
classifications = knn(UnderSamp[,c(16,29,33)],UnderSamp[,c(16,29,33)],UnderSamp[,3], prob = TRUE, k = 5)

table(classifications,UnderSamp[,3])
##                
## classifications  No Yes
##             No  100  41
##             Yes  40  99
CM = confusionMatrix(table(classifications,UnderSamp[,3]))
CM
## Confusion Matrix and Statistics
## 
##                
## classifications  No Yes
##             No  100  41
##             Yes  40  99
##                                           
##                Accuracy : 0.7107          
##                  95% CI : (0.6538, 0.7631)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 6.299e-13       
##                                           
##                   Kappa : 0.4214          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.7143          
##             Specificity : 0.7071          
##          Pos Pred Value : 0.7092          
##          Neg Pred Value : 0.7122          
##              Prevalence : 0.5000          
##          Detection Rate : 0.3571          
##    Detection Prevalence : 0.5036          
##       Balanced Accuracy : 0.7107          
##                                           
##        'Positive' Class : No              
## 
# NOTE(review): hard labels as scores again — this AUC equals the balanced accuracy
ROCit_obj <- rocit(score=as.numeric(classifications),class=UnderSamp[,3])
plot(ROCit_obj)

ROCit_obj$AUC
## [1] 0.7107143

#Oversampling and using KNN and NB

# Balance the classes by oversampling the minority ("Yes") class with
# replacement up to the majority ("No") class size. The class sizes are now
# derived from the data instead of being hard-coded (previously 140 and
# 730), so the same code keeps working if the training data changes;
# seq(1,140,1) is replaced with the idiomatic seq_len().
OnlyYes = CS2 %>% filter(Attrition == "Yes")
OnlyNo = CS2 %>% filter(Attrition == "No")
nExtra = nrow(OnlyNo) - nrow(OnlyYes)
OnlyYesOver = rbind(OnlyYes, OnlyYes[sample(seq_len(nrow(OnlyYes)), nExtra, replace = TRUE), ])
dim(OnlyYesOver)
## [1] 730  36
OverSamp = rbind(OnlyNo, OnlyYesOver)
dim(OverSamp)
## [1] 1460   36
# 70/30 train-test split of the oversampled data.
# Fixes: the split now actually uses splitPerc (it was declared but a
# hard-coded .7 was used in the sample size), and seq(1:length(...)) is
# replaced with seq_len(nrow(...)); stray indentation removed.
splitPerc = .7 # Training / Test split percentage
trainI = sample(seq_len(nrow(OverSamp)), round(splitPerc * nrow(OverSamp)))
train = OverSamp[trainI, ]
test = OverSamp[-trainI, ]
# Naive Bayes on the oversampled train split, evaluated on the held-out
# test split, using predictor columns 16, 29, 33.
# NOTE(review): the minority rows were duplicated (L.above) BEFORE the
# train/test split, so copies of the same employee can appear in both train
# and test -- a leakage that inflates these metrics somewhat.
model = naiveBayes(train[,c(16,29,33)],train$Attrition)
# Confusion table on the test split: predicted (rows) vs actual (columns).
table(predict(model,test[,c(16,29,33)]),test$Attrition)
##      
##        No Yes
##   No  109  67
##   Yes  98 164
CM = confusionMatrix(table(predict(model,test[,c(16,29,33)]),test$Attrition))
CM
## Confusion Matrix and Statistics
## 
##      
##        No Yes
##   No  109  67
##   Yes  98 164
##                                           
##                Accuracy : 0.6233          
##                  95% CI : (0.5761, 0.6688)
##     No Information Rate : 0.5274          
##     P-Value [Acc > NIR] : 3.215e-05       
##                                           
##                   Kappa : 0.2384          
##                                           
##  Mcnemar's Test P-Value : 0.01952         
##                                           
##             Sensitivity : 0.5266          
##             Specificity : 0.7100          
##          Pos Pred Value : 0.6193          
##          Neg Pred Value : 0.6260          
##              Prevalence : 0.4726          
##          Detection Rate : 0.2489          
##    Detection Prevalence : 0.4018          
##       Balanced Accuracy : 0.6183          
##                                           
##        'Positive' Class : No              
## 
# KNN (k = 5) on the full oversampled data, scored on the same rows.
# NOTE(review): resubstitution again (train == test), and with k = 5 on
# data containing exact duplicate minority rows a point's own copies can be
# among its nearest neighbors, so these metrics are optimistic.
classifications = knn(OverSamp[,c(16,29,33)],OverSamp[c(16,29,33)],OverSamp[,3], prob = TRUE, k = 5)

# Raw confusion table: predicted class (rows) vs actual Attrition (columns).
table(classifications,OverSamp[,3])
##                
## classifications  No Yes
##             No  485 108
##             Yes 245 622
CM = confusionMatrix(table(classifications,OverSamp[,3]))
CM
## Confusion Matrix and Statistics
## 
##                
## classifications  No Yes
##             No  485 108
##             Yes 245 622
##                                         
##                Accuracy : 0.7582        
##                  95% CI : (0.7354, 0.78)
##     No Information Rate : 0.5           
##     P-Value [Acc > NIR] : < 2.2e-16     
##                                         
##                   Kappa : 0.5164        
##                                         
##  Mcnemar's Test P-Value : 4.535e-13     
##                                         
##             Sensitivity : 0.6644        
##             Specificity : 0.8521        
##          Pos Pred Value : 0.8179        
##          Neg Pred Value : 0.7174        
##              Prevalence : 0.5000        
##          Detection Rate : 0.3322        
##    Detection Prevalence : 0.4062        
##       Balanced Accuracy : 0.7582        
##                                         
##        'Positive' Class : No            
## 
# ROC / AUC from the hard class labels, as for the undersampled model.
ROCit_obj <- rocit(score=as.numeric(classifications),class=OverSamp[,3])
plot(ROCit_obj)

ROCit_obj$AUC
## [1] 0.7582192

#Comparison of the models and methods wrt Accuracy, Sensitivity and Specificity

# Read the hand-collected model-comparison metrics from the Excel file and
# render them with kable. Fix: the variable is renamed from the misspelled
# "Comprison"; kable() receives exactly the same data frame.
# NOTE(review): the absolute macOS path ties this knit to one machine.
Comparison <- read_excel("/Users/Banu/Documents/SMU MSDS/MSDS_6306_DoingDataScience/Unit14/Comparison.xlsx")

# Display the data as a table using kable
kable(Comparison)
Attribute KNNUnbalanced KNNUndersampling KNNOversampling NBUnbalanced NBUndersampling NBOversampling
Accuracy 0.8115 0.7071 0.7795 0.8621 0.6310 0.6598
Sensitivity 0.8863 0.7357 0.6849 1.0000 0.5652 0.5980
Specificity 0.4214 0.6786 0.8740 0.0000 0.7105 0.7113

#Predict the Test Data using the KNN OverSampling Model

# Score the competition test set with KNN fit on the full oversampled data.
# NOTE(review): the test-set predictor columns are c(15,28,32) while the
# training columns are c(16,29,33) -- presumably the same three variables
# shifted by one because the test file has no Attrition column; verify
# against the test file's actual column layout.
CS2Test1 <- CS2Test1 %>% mutate(Attrition = knn(OverSamp[,c(16,29,33)],CS2Test1[c(15,28,32)],OverSamp[,3], prob = TRUE, k = 5))

#Plot Percentage Attrition by JobLevel filled by StockOptionLevel
# (the header comment previously said "JobRole", but the code groups and
# plots by JobLevel)

CS2Test1 %>% group_by(StockOptionLevel, JobLevel, Attrition) %>% summarise(JRCount = n()) %>% group_by(JobLevel) %>% mutate(PA = JRCount/sum(JRCount)) %>% filter(Attrition == "Yes") %>% ggplot(aes(x = JobLevel, y = PA,fill = StockOptionLevel)) + geom_bar(stat="identity") + ggtitle("Distribution of JobLevel by  Attrition") + ylab("Attrition") + xlab("JobLevel")
## `summarise()` has grouped output by 'StockOptionLevel', 'JobLevel'. You can
## override using the `.groups` argument.

# Keep only the ID and predicted Attrition columns for the submission file.
CS2Attrition <- CS2Test1 %>% select(ID, Attrition)

#write the predictions to a file
write.csv(CS2Attrition, file = "/Users/Banu/Documents/SMU MSDS/MSDS_6306_DoingDataScience/Unit14/Case2PredictionsPullaiahnaiduAttrition.csv", row.names = FALSE)

#Predict Salary

# Exploratory scatter plots of MonthlyIncome against three candidate
# predictors, each with a simple linear-model smoother overlaid.
ggplot(CS2, aes(x = YearsAtCompany, y = MonthlyIncome)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle("CS2: MonthlyIncome v. YearsAtCompany")
## `geom_smooth()` using formula = 'y ~ x'

ggplot(CS2, aes(x = JobLevel, y = MonthlyIncome)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle("CS2: MonthlyIncome v. JobLevel")
## `geom_smooth()` using formula = 'y ~ x'

ggplot(CS2, aes(x = TotalWorkingYears, y = MonthlyIncome)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle("CS2: MonthlyIncome v. TotalWorkingYears")
## `geom_smooth()` using formula = 'y ~ x'

# Candidate model 1: simple linear regression of MonthlyIncome on JobLevel.
Model_fit = lm(MonthlyIncome~JobLevel, data = CS2)
summary(Model_fit)
## 
## Call:
## lm(formula = MonthlyIncome ~ JobLevel, data = CS2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5037.1  -928.2    80.1   697.1  3723.6 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1793.93     101.68  -17.64   <2e-16 ***
## JobLevel     4013.67      43.98   91.26   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1413 on 868 degrees of freedom
## Multiple R-squared:  0.9056, Adjusted R-squared:  0.9055 
## F-statistic:  8329 on 1 and 868 DF,  p-value: < 2.2e-16
confint(Model_fit)
##                 2.5 %    97.5 %
## (Intercept) -1993.494 -1594.375
## JobLevel     3927.352  4099.990
# NOTE(review): predictions are made on the same data the model was fit
# with, so this RMSE is a training (resubstitution) error -- usable for
# ranking the three candidate models but optimistic as a test-error
# estimate; cross-validation would be more defensible.
Model_Preds = predict(Model_fit, newdata = CS2)
#as.data.frame(Model_Preds)
RMSE = sqrt(mean((CS2$MonthlyIncome - Model_Preds)^2))
RMSE
## [1] 1411.67
# Candidate model 2: add TotalWorkingYears to JobLevel.
Model1_fit = lm(MonthlyIncome~JobLevel+TotalWorkingYears, data = CS2)
summary(Model1_fit)
## 
## Call:
## lm(formula = MonthlyIncome ~ JobLevel + TotalWorkingYears, data = CS2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5469.9  -876.8    64.5   728.3  3937.5 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -1798.38      99.98 -17.987  < 2e-16 ***
## JobLevel           3714.12      69.21  53.664  < 2e-16 ***
## TotalWorkingYears    55.66      10.04   5.544 3.94e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1390 on 867 degrees of freedom
## Multiple R-squared:  0.9088, Adjusted R-squared:  0.9086 
## F-statistic:  4322 on 2 and 867 DF,  p-value: < 2.2e-16
confint(Model1_fit)
##                         2.5 %      97.5 %
## (Intercept)       -1994.61016 -1602.14196
## JobLevel           3578.28325  3849.96123
## TotalWorkingYears    35.95588    75.37198
# Training (resubstitution) RMSE, as for candidate model 1.
Model1_Preds = predict(Model1_fit, newdata = CS2)
#as.data.frame(Model1_Preds)
RMSE1 = sqrt(mean((CS2$MonthlyIncome - Model1_Preds)^2))
RMSE1
## [1] 1387.298
# Candidate model 3: JobLevel + TotalWorkingYears + YearsAtCompany.
Model2_fit = lm(MonthlyIncome~JobLevel+TotalWorkingYears+YearsAtCompany, data = CS2)
summary(Model2_fit)
## 
## Call:
## lm(formula = MonthlyIncome ~ JobLevel + TotalWorkingYears + YearsAtCompany, 
##     data = CS2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5625.1  -888.5    42.3   725.5  3968.3 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -1764.37     100.04 -17.637  < 2e-16 ***
## JobLevel           3724.98      68.94  54.035  < 2e-16 ***
## TotalWorkingYears    70.76      11.07   6.394 2.64e-10 ***
## YearsAtCompany      -32.04      10.11  -3.170  0.00158 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1383 on 866 degrees of freedom
## Multiple R-squared:  0.9099, Adjusted R-squared:  0.9096 
## F-statistic:  2915 on 3 and 866 DF,  p-value: < 2.2e-16
confint(Model2_fit)
##                         2.5 %      97.5 %
## (Intercept)       -1960.72215 -1568.02112
## JobLevel           3589.67851  3860.28464
## TotalWorkingYears    49.04284    92.48534
## YearsAtCompany      -51.87443   -12.20068
# Training (resubstitution) RMSE; lowest of the three candidates, which is
# why this formula is chosen for the final model below.
Model2_Preds = predict(Model2_fit, newdata = CS2)
#as.data.frame(Model2_Preds)
RMSE2 = sqrt(mean((CS2$MonthlyIncome - Model2_Preds)^2))
RMSE2
## [1] 1379.319

#Choosing to predict the test dataset using Model2 since it has the best RMSE

# NOTE(review): although the comment above says "Model2", the refit below
# stores Model2's three-predictor formula under the name Model1_fit. The
# name is kept as-is because the prediction code later in the file
# references Model1_fit; renaming here would break it.
Model1_fit = lm(MonthlyIncome~JobLevel+TotalWorkingYears+YearsAtCompany, data = CS2)
summary(Model1_fit)
## 
## Call:
## lm(formula = MonthlyIncome ~ JobLevel + TotalWorkingYears + YearsAtCompany, 
##     data = CS2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5625.1  -888.5    42.3   725.5  3968.3 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -1764.37     100.04 -17.637  < 2e-16 ***
## JobLevel           3724.98      68.94  54.035  < 2e-16 ***
## TotalWorkingYears    70.76      11.07   6.394 2.64e-10 ***
## YearsAtCompany      -32.04      10.11  -3.170  0.00158 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1383 on 866 degrees of freedom
## Multiple R-squared:  0.9099, Adjusted R-squared:  0.9096 
## F-statistic:  2915 on 3 and 866 DF,  p-value: < 2.2e-16
confint(Model1_fit)
##                         2.5 %      97.5 %
## (Intercept)       -1960.72215 -1568.02112
## JobLevel           3589.67851  3860.28464
## TotalWorkingYears    49.04284    92.48534
## YearsAtCompany      -51.87443   -12.20068
# Predict MonthlyIncome for the held-out competition test set.
Model1_Preds = predict(Model1_fit, newdata = CS2Test2)
#as.data.frame(Model1_Preds)
# Bug fix: the previous code computed
#   sqrt(mean((CS2$MonthlyIncome - Model1_Preds)^2))
# which subtracted the TRAINING-set incomes (870 rows) from the TEST-set
# predictions (fewer rows), silently recycling the shorter vector (the
# recycling warning in the knitted output confirms this). The resulting
# "MSPE" of 6000.5 was meaningless. A test-set RMSE is only computable when
# the test data actually contains the response, so guard on that; with the
# label-free competition file this simply skips the calculation.
if ("MonthlyIncome" %in% names(CS2Test2)) {
  MSPE1 = sqrt(mean((CS2Test2$MonthlyIncome - Model1_Preds)^2))
  MSPE1
}

#Write the Salary predictions to a file

# Re-predict with the final model and round to whole dollars.
MonthlyIncome = predict(Model1_fit, newdata = CS2Test2)
Model1_Preds_df <- as.data.frame(round(MonthlyIncome))
# as.data.frame() names the single column after the expression; rename it
# so the cbind below yields a conventional MonthlyIncome column.
# NOTE(review): this assumes CS2Test2 has no MonthlyIncome column of its
# own, otherwise cbind would produce a duplicate name -- confirm against
# the test file.
names(Model1_Preds_df) <- "MonthlyIncome"
CS2Test2_pred <- cbind(CS2Test2, Model1_Preds_df)

# Sanity-check the predictions visually against the same three predictors
# explored for the training data.
CS2Test2_pred %>% 
  ggplot(aes(x = YearsAtCompany, y = MonthlyIncome)) + geom_point() + ggtitle("CS2: MonthlyIncome v. YearsAtCompany") + geom_smooth(method = "lm") 
## `geom_smooth()` using formula = 'y ~ x'

CS2Test2_pred %>% 
  ggplot(aes(x = JobLevel, y = MonthlyIncome)) + geom_point() + ggtitle("CS2: MonthlyIncome v. JobLevel") + geom_smooth(method = "lm") 
## `geom_smooth()` using formula = 'y ~ x'

CS2Test2_pred %>% 
  ggplot(aes(x = TotalWorkingYears, y = MonthlyIncome)) + geom_point() + ggtitle("CS2: MonthlyIncome v. TotalWorkingYears") + geom_smooth(method = "lm") 
## `geom_smooth()` using formula = 'y ~ x'

# Keep only the ID and predicted MonthlyIncome columns for the submission.
CS2Salary <- CS2Test2_pred %>% select(ID, MonthlyIncome)

#write the predictions to a file
write.csv(CS2Salary, file = "/Users/Banu/Documents/SMU MSDS/MSDS_6306_DoingDataScience/Unit14/Case2PredictionsPullaiahnaiduSalary.csv", row.names = FALSE)